import os
import re
import string
import random
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import spacy
from spacy.util import compounding, minibatch
warnings.filterwarnings("ignore")
for dirname, _, filenames in os.walk('/content/drive/MyDrive/AI/评论情感词提取/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Any results you write to the current directory are saved as output.
PATH = 'F:\\〇\\大三上\\文本挖掘技术\\data\\大作业\\'
def random_colours(number_of_colors):
    '''
    Simple function for random colour generation.
    Input:
        number_of_colors - integer value indicating the number of colours to generate.
    Output:
        A list of colours in the following format: ['#E86DA4'].
    '''
    colors = []
    for i in range(number_of_colors):
        colors.append("#" + ''.join([random.choice('0123456789ABCDEF') for j in range(6)]))
    return colors
train = pd.read_csv(PATH+'train.csv')
test = pd.read_csv(PATH+'test.csv')
#ss = pd.read_csv(PATH+'sample_submission.csv')
print(train.shape)
print(test.shape)
train.info()
train.dropna(inplace=True)
test.info()
EDA
train.head()
train.describe()
temp = train.groupby('sentiment').count()['text'].reset_index().sort_values(by='text',ascending=False)
temp.style.background_gradient(cmap='Purples')
plt.figure(figsize=(12,6))
sns.countplot(x='sentiment',data=train)
fig = go.Figure(go.Funnelarea(
    text=temp.sentiment,
    values=temp.text,
    title={"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
))
fig.show()
def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
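A quick sanity check with two made-up strings: four shared tokens out of five distinct tokens overall gives 0.8:
# Hypothetical example, not from the dataset: union has 5 tokens, intersection has 4
print(jaccard('I am feeling good today', 'feeling good today I'))  # 0.8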
results_jaccard = []
for ind, row in train.iterrows():
    sentence1 = row.text
    sentence2 = row.selected_text
    jaccard_score = jaccard(sentence1, sentence2)
    results_jaccard.append([sentence1, sentence2, jaccard_score])
# Use a distinct name so the DataFrame does not shadow the jaccard() function
jaccard_df = pd.DataFrame(results_jaccard, columns=["text", "selected_text", "jaccard_score"])
train = train.merge(jaccard_df, how='outer')
train['Num_words_ST'] = train['selected_text'].apply(lambda x:len(str(x).split())) #Number Of words in Selected Text
train['Num_word_text'] = train['text'].apply(lambda x:len(str(x).split())) #Number Of words in main text
train['difference_in_words'] = train['Num_word_text'] - train['Num_words_ST'] #Difference in Number of words text and Selected Text
train.head()
hist_data = [train['Num_words_ST'],train['Num_word_text']]
group_labels = ['Selected_Text', 'Text']
# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels,show_curve=False)
fig.update_layout(title_text='Distribution of Number Of words')
fig.update_layout(
    autosize=False,
    width=900,
    height=700,
    paper_bgcolor="LightSteelBlue",
)
fig.show()
plt.figure(figsize=(12,6))
p1=sns.kdeplot(train['Num_words_ST'], shade=True, color="r").set_title('Kernel Distribution of Number Of words')
p1=sns.kdeplot(train['Num_word_text'], shade=True, color="b")
plt.figure(figsize=(12,6))
p1=sns.kdeplot(train[train['sentiment']=='positive']['difference_in_words'], shade=True, color="b").set_title('Kernel Distribution of Difference in Number Of words')
p2=sns.kdeplot(train[train['sentiment']=='negative']['difference_in_words'], shade=True, color="r")
plt.figure(figsize=(12,6))
sns.distplot(train[train['sentiment']=='neutral']['difference_in_words'],kde=False)
plt.figure(figsize=(12,6))
p1=sns.kdeplot(train[train['sentiment']=='positive']['jaccard_score'], shade=True, color="b").set_title('KDE of Jaccard Scores across different Sentiments')
p2=sns.kdeplot(train[train['sentiment']=='negative']['jaccard_score'], shade=True, color="r")
plt.legend(labels=['positive','negative'])
plt.figure(figsize=(12,6))
sns.distplot(train[train['sentiment']=='neutral']['jaccard_score'],kde=False)
k = train[train['Num_word_text']<=2]
k.groupby('sentiment').mean()['jaccard_score']
k[k['sentiment']=='positive']
Cleaning the Corpus
Before we start extracting information from the words in text and selected_text, let's first clean the data.
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
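A quick illustration on a made-up tweet; note that the regexes leave extra spaces behind, since the function never collapses whitespace:
# Hypothetical input: bracketed text, the URL, punctuation and the digit-containing token are stripped
print(clean_text('Check [this] out: https://t.co/abc, gr8 stuff!'))  # 'check  out   stuff'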
train['text'] = train['text'].apply(lambda x:clean_text(x))
train['selected_text'] = train['selected_text'].apply(lambda x:clean_text(x))
train.head()
Most Common Words in Our Target (Selected Text)
train['temp_list'] = train['selected_text'].apply(lambda x:str(x).split())
top = Counter([item for sublist in train['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')
fig = px.bar(temp, x="count", y="Common_words", title='Commmon Words in Selected Text', orientation='h',
width=700, height=700,color='Common_words')
fig.show()
Oops! While we cleaned our dataset we didn't remove the stop words, hence the most common word is 'to'. Let's try again after removing the stopwords.
nltk.download('stopwords')
STOP_WORDS = set(stopwords.words('english'))  # build the stopword set once instead of on every lookup
def remove_stopword(x):
    return [y for y in x if y not in STOP_WORDS]
train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))
top = Counter([item for sublist in train['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp = temp.iloc[1:,:]
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Purples')
# path=['Common_words']
# fig = px.treemap(temp, path, values=['count'],title='Tree of Most Common Words', color='Common_words')
# fig.show()
Most Common Words in Text
train['temp_list1'] = train['text'].apply(lambda x:str(x).split()) #List of words in every row for text
train['temp_list1'] = train['temp_list1'].apply(lambda x:remove_stopword(x)) #Removing Stopwords
top = Counter([item for sublist in train['temp_list1'] for item in sublist])
temp = pd.DataFrame(top.most_common(25))
temp = temp.iloc[1:,:]
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')
fig = px.bar(temp, x="count", y="Common_words", title='Commmon Words in Text', orientation='h',
width=700, height=700,color='Common_words')
fig.show()
Most Common Words by Sentiment
Positive_sent = train[train['sentiment']=='positive']
Negative_sent = train[train['sentiment']=='negative']
Neutral_sent = train[train['sentiment']=='neutral']
# Most common positive words
top = Counter([item for sublist in Positive_sent['temp_list'] for item in sublist])
temp_positive = pd.DataFrame(top.most_common(20))
temp_positive.columns = ['Common_words','count']
temp_positive.style.background_gradient(cmap='Greens')
fig = px.bar(temp_positive, x="count", y="Common_words", title='Most Common Positive Words', orientation='h',
             width=700, height=700, color='Common_words')
fig.show()
# Most common negative words
top = Counter([item for sublist in Negative_sent['temp_list'] for item in sublist])
temp_negative = pd.DataFrame(top.most_common(20))
temp_negative = temp_negative.iloc[1:,:]
temp_negative.columns = ['Common_words','count']
temp_negative.style.background_gradient(cmap='Reds')
# print(path)
# path=['Common_words']
# fig = px.treemap(temp_negative, path, values='count',title='Tree Of Most Common Negative Words')
# fig.show()
# Most common neutral words
top = Counter([item for sublist in Neutral_sent['temp_list'] for item in sublist])
temp_neutral = pd.DataFrame(top.most_common(20))
temp_neutral = temp_neutral.loc[1:,:]
temp_neutral.columns = ['Common_words','count']
temp_neutral.style.background_gradient(cmap='Reds')
fig = px.bar(temp_neutral, x="count", y="Common_words", title='Most Common Neutral Words', orientation='h',
             width=700, height=700, color='Common_words')
fig.show()
# path=['Common_words']
# fig = px.treemap(temp_neutral, path, values='count',title='Tree Of Most Common Neutral Words')
# fig.show()
Let's Look at Unique Words in each Segment
raw_text = [word for word_list in train['temp_list1'] for word in word_list]
def words_unique(sentiment, numwords, raw_words):
    '''
    Input:
        sentiment - sentiment category (e.g. 'neutral');
        numwords - how many specific words you want to see in the final result;
        raw_words - list of all words across train['temp_list1'].
    Output:
        DataFrame listing each word that occurs only in the chosen sentiment and how many
        times it occurs there (in descending order of counts).
    '''
    allother = []
    for item in train[train.sentiment != sentiment]['temp_list1']:
        for word in item:
            allother.append(word)
    allother = list(set(allother))
    specificnonly = [x for x in raw_words if x not in allother]  # words never used by the other sentiments
    mycounter = Counter()
    for item in train[train.sentiment == sentiment]['temp_list1']:
        for word in item:
            mycounter[word] += 1
    keep = list(specificnonly)
    for word in list(mycounter):
        if word not in keep:
            del mycounter[word]
    Unique_words = pd.DataFrame(mycounter.most_common(numwords), columns=['words', 'count'])
    return Unique_words
Positive Tweets
Unique_Positive= words_unique('positive', 20, raw_text)
print("The top 20 unique words in Positive Tweets are:")
Unique_Positive.style.background_gradient(cmap='Greens')
# path=['words']
# fig = px.treemap(Unique_Positive, path, values='count',title='Tree Of Unique Positive Words')
# fig.show()
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16,10))
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.pie(Unique_Positive['count'], labels=Unique_Positive.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('Donut Plot of Unique Positive Words')
plt.show()
Unique_Negative= words_unique('negative', 10, raw_text)
print("The top 10 unique words in Negative Tweets are:")
Unique_Negative.style.background_gradient(cmap='Reds')
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16,10))
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.rcParams['text.color'] = 'black'
plt.pie(Unique_Negative['count'], labels=Unique_Negative.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('Donut Plot of Unique Negative Words')
plt.show()
Unique_Neutral= words_unique('neutral', 10, raw_text)
print("The top 10 unique words in Neutral Tweets are:")
Unique_Neutral.style.background_gradient(cmap='Oranges')
from palettable.colorbrewer.qualitative import Pastel1_7
plt.figure(figsize=(16,10))
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.pie(Unique_Neutral['count'], labels=Unique_Neutral.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('Donut Plot of Unique Neutral Words')
plt.show()
It's Time For WordClouds
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(24.0, 16.0), color='white',
                   title=None, title_size=40, image_color=False):
    stopwords = set(STOPWORDS)
    more_stopwords = {'u', "im"}
    stopwords = stopwords.union(more_stopwords)
    wordcloud = WordCloud(background_color=color,
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          width=400,
                          height=200,
                          mask=mask)
    wordcloud.generate(str(text))
    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
        plt.title(title, fontdict={'size': title_size,
                                   'verticalalignment': 'bottom'})
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={'size': title_size, 'color': 'black',
                                   'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()
d = '/content/drive/MyDrive/masks-wordclouds/'
pos_mask = np.array(Image.open(d+ 'comment.png'))
plot_wordcloud(Neutral_sent.text,mask=pos_mask,color='white',max_font_size=100,title_size=30,title="WordCloud of Neutral Tweets")
plot_wordcloud(Positive_sent.text,mask=pos_mask,title="Word Cloud Of Positive tweets",title_size=30)
plot_wordcloud(Negative_sent.text,mask=pos_mask,title="Word Cloud of Negative Tweets",color='white',title_size=30)
Modelling
1) Modelling the Problem as NER
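The trick is to treat selected_text as a single entity span inside text: each training example becomes a (text, annotations) pair in spaCy v2's NER format, with character offsets and a label. A minimal hand-written illustration (the offsets below are just for this made-up pair):
# One training example in spaCy v2 NER format: [start_char, end_char, label]
example = ("my boss is bullying me", {"entities": [[11, 22, 'selected_text']]})
text, annotations = example
start, end, label = annotations["entities"][0]
print(text[start:end])  # 'bullying me' -- the span the NER model should learn to tag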
df_train = pd.read_csv(PATH+'train.csv')
df_test = pd.read_csv(PATH+'test.csv')
df_submission = pd.read_csv(PATH+'sample_submission.csv')
df_train['Num_words_text'] = df_train['text'].apply(lambda x:len(str(x).split())) #Number Of words in main Text in train set
df_train = df_train[df_train['Num_words_text']>=3]
def save_model(output_dir, nlp, new_model_name):
    '''This function saves the model to the given output directory.'''
    output_dir = f'/content/drive/MyDrive/AI/评论情感词提取/{output_dir}'
    if output_dir is not None:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
# pass model = nlp if you want to train on top of an existing model
def train(train_data, output_dir, n_iter=50, model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    # NOTE: this uses the spaCy v2 training API (nlp.update(texts, annotations))
    if model is not None:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()
        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 1000.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,        # batch of texts
                           annotations,  # batch of annotations
                           drop=0.9,     # dropout - make it harder to memorise data
                           losses=losses,
                           )
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')
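A side note on the batching above: spacy.util.compounding(4.0, 1000.0, 1.001) is an infinite generator whose values start at 4 and are multiplied by 1.001 at each step (capped at 1000), so batch sizes grow very gradually. A quick peek at the schedule:
sizes = compounding(4.0, 1000.0, 1.001)
print([round(next(sizes), 3) for _ in range(4)])  # [4.0, 4.004, 4.008, 4.012]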
def get_model_out_path(sentiment):
    '''Returns the model output path for a given sentiment.'''
    model_out_path = None
    if sentiment == 'positive':
        model_out_path = 'models/model_pos'
    elif sentiment == 'negative':
        model_out_path = 'models/model_neg'
    return model_out_path
def get_training_data(sentiment):
    '''
    Returns training data in the format needed to train the spaCy NER model.
    '''
    train_data = []
    for index, row in df_train.iterrows():
        if row.sentiment == sentiment:
            selected_text = row.selected_text
            text = row.text
            start = text.find(selected_text)
            end = start + len(selected_text)
            train_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return train_data
Training models for Positive and Negative tweets
sentiment = 'positive'
train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)
# For demo purposes you can reduce n_iter; here the model is trained for 50 iterations
train(train_data, model_path, n_iter=50, model=None)
sentiment = 'negative'
train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)
train(train_data, model_path, n_iter=50, model=None)
Predicting with the trained Model
def predict_entities(text, model):
    doc = model(text)
    ent_array = []
    for ent in doc.ents:
        start = text.find(ent.text)
        end = start + len(ent.text)
        new_int = [start, end, ent.label_]
        if new_int not in ent_array:
            ent_array.append([start, end, ent.label_])
    selected_text = text[ent_array[0][0]: ent_array[0][1]] if len(ent_array) > 0 else text
    return selected_text
selected_texts = []
MODELS_BASE_PATH = '/content/drive/MyDrive/AI/评论情感词提取/models/'
if MODELS_BASE_PATH is not None:
    print("Loading Models from ", MODELS_BASE_PATH)
    model_pos = spacy.load(MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(MODELS_BASE_PATH + 'model_neg')
    for index, row in df_test.iterrows():
        text = row.text
        # Neutral and very short tweets: the full text is the best guess (see the Jaccard EDA above)
        if row.sentiment == 'neutral' or len(text.split()) <= 2:
            selected_texts.append(text)
        elif row.sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_pos))
        else:
            selected_texts.append(predict_entities(text, model_neg))
df_test['selected_text'] = selected_texts
df_submission['selected_text'] = df_test['selected_text']
df_submission['id'] = df_test['id']
df_submission.to_csv("/content/drive/MyDrive/AI/评论情感词提取/output/DA_n_iter50_drop0.9_bsz4_1000.csv", header=None, index=False, sep=',')
display(df_submission.head(10))